• Load the dataset and perform initial data exploration. • Summarize key statistics, including measures of central tendency and dispersion. • Visualize data distributions (e.g., histograms) for relevant features. • Identify any missing values and suggest appropriate strategies for handling them. • Explore relationships between variables (e.g., correlation matrix, scatter plots).
1.1) Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import xgboost as xg
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")
# Load the heart-disease CSV into a DataFrame.
# NOTE(review): hard-coded absolute Windows path — consider a relative
# path or a configurable location so the notebook runs on other machines.
data = pd.read_csv("C:\\Users\\Dell\\Downloads\\cardio.csv")
data.head()  # preview the first five rows
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
| 1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
| 2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
| 3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
| 4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
Data Preprocessing: • Prepare the data for modeling by addressing missing values and data quality issues.
data.info()  # column dtypes, non-null counts, memory usage
<class 'pandas.core.frame.DataFrame'> RangeIndex: 303 entries, 0 to 302 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 303 non-null int64 1 sex 303 non-null int64 2 cp 303 non-null int64 3 trestbps 303 non-null int64 4 chol 303 non-null int64 5 fbs 303 non-null int64 6 restecg 303 non-null int64 7 thalach 303 non-null int64 8 exang 303 non-null int64 9 oldpeak 303 non-null float64 10 slope 303 non-null int64 11 ca 303 non-null int64 12 thal 303 non-null int64 13 target 303 non-null int64 dtypes: float64(1), int64(13) memory usage: 33.3 KB
# Count missing entries per column (all zero for this dataset).
missing_values = data.isna().sum()
missing_values
age 0 sex 0 cp 0 trestbps 0 chol 0 fbs 0 restecg 0 thalach 0 exang 0 oldpeak 0 slope 0 ca 0 thal 0 target 0 dtype: int64
data.shape  # (rows, columns)
(303, 14)
data.columns  # thirteen feature names plus 'target'
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
dtype='object')
data.dtypes  # all int64 except 'oldpeak' (float64)
age int64 sex int64 cp int64 trestbps int64 chol int64 fbs int64 restecg int64 thalach int64 exang int64 oldpeak float64 slope int64 ca int64 thal int64 target int64 dtype: object
data.describe()  # count/mean/std/quartiles for the numeric columns
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 |
| mean | 54.366337 | 0.683168 | 0.966997 | 131.623762 | 246.264026 | 0.148515 | 0.528053 | 149.646865 | 0.326733 | 1.039604 | 1.399340 | 0.729373 | 2.313531 | 0.544554 |
| std | 9.082101 | 0.466011 | 1.032052 | 17.538143 | 51.830751 | 0.356198 | 0.525860 | 22.905161 | 0.469794 | 1.161075 | 0.616226 | 1.022606 | 0.612277 | 0.498835 |
| min | 29.000000 | 0.000000 | 0.000000 | 94.000000 | 126.000000 | 0.000000 | 0.000000 | 71.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 47.500000 | 0.000000 | 0.000000 | 120.000000 | 211.000000 | 0.000000 | 0.000000 | 133.500000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 |
| 50% | 55.000000 | 1.000000 | 1.000000 | 130.000000 | 240.000000 | 0.000000 | 1.000000 | 153.000000 | 0.000000 | 0.800000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 |
| 75% | 61.000000 | 1.000000 | 2.000000 | 140.000000 | 274.500000 | 0.000000 | 1.000000 | 166.000000 | 1.000000 | 1.600000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 |
| max | 77.000000 | 1.000000 | 3.000000 | 200.000000 | 564.000000 | 1.000000 | 2.000000 | 202.000000 | 1.000000 | 6.200000 | 2.000000 | 4.000000 | 3.000000 | 1.000000 |
def Sex(a):
    """Map the numeric ``sex`` code to a readable label (0 -> Female, else Male)."""
    return "Female" if a == 0 else "Male"
data['Sex_S'] = data['sex'].apply(Sex)  # readable sex label column
def Target(a):
    """Map the numeric ``target`` code to a readable label.

    BUG FIX: per the standard UCI/Kaggle heart-disease data dictionary,
    ``target == 1`` indicates the PRESENCE of heart disease; the original
    mapping returned the labels the other way around.
    NOTE(review): confirm against this specific CSV's documentation.
    """
    if a == 1:
        return "Heart Disease"
    return "No Heart Disease"
data['Target_S'] = data['target'].apply(Target)  # readable outcome label column
# Restrict to numeric columns: the derived Sex_S / Target_S string columns
# added above make DataFrame.corr() raise a TypeError on pandas >= 2.0.
data.corr(numeric_only=True)
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 1.000000 | -0.098447 | -0.068653 | 0.279351 | 0.213678 | 0.121308 | -0.116211 | -0.398522 | 0.096801 | 0.210013 | -0.168814 | 0.276326 | 0.068001 | -0.225439 |
| sex | -0.098447 | 1.000000 | -0.049353 | -0.056769 | -0.197912 | 0.045032 | -0.058196 | -0.044020 | 0.141664 | 0.096093 | -0.030711 | 0.118261 | 0.210041 | -0.280937 |
| cp | -0.068653 | -0.049353 | 1.000000 | 0.047608 | -0.076904 | 0.094444 | 0.044421 | 0.295762 | -0.394280 | -0.149230 | 0.119717 | -0.181053 | -0.161736 | 0.433798 |
| trestbps | 0.279351 | -0.056769 | 0.047608 | 1.000000 | 0.123174 | 0.177531 | -0.114103 | -0.046698 | 0.067616 | 0.193216 | -0.121475 | 0.101389 | 0.062210 | -0.144931 |
| chol | 0.213678 | -0.197912 | -0.076904 | 0.123174 | 1.000000 | 0.013294 | -0.151040 | -0.009940 | 0.067023 | 0.053952 | -0.004038 | 0.070511 | 0.098803 | -0.085239 |
| fbs | 0.121308 | 0.045032 | 0.094444 | 0.177531 | 0.013294 | 1.000000 | -0.084189 | -0.008567 | 0.025665 | 0.005747 | -0.059894 | 0.137979 | -0.032019 | -0.028046 |
| restecg | -0.116211 | -0.058196 | 0.044421 | -0.114103 | -0.151040 | -0.084189 | 1.000000 | 0.044123 | -0.070733 | -0.058770 | 0.093045 | -0.072042 | -0.011981 | 0.137230 |
| thalach | -0.398522 | -0.044020 | 0.295762 | -0.046698 | -0.009940 | -0.008567 | 0.044123 | 1.000000 | -0.378812 | -0.344187 | 0.386784 | -0.213177 | -0.096439 | 0.421741 |
| exang | 0.096801 | 0.141664 | -0.394280 | 0.067616 | 0.067023 | 0.025665 | -0.070733 | -0.378812 | 1.000000 | 0.288223 | -0.257748 | 0.115739 | 0.206754 | -0.436757 |
| oldpeak | 0.210013 | 0.096093 | -0.149230 | 0.193216 | 0.053952 | 0.005747 | -0.058770 | -0.344187 | 0.288223 | 1.000000 | -0.577537 | 0.222682 | 0.210244 | -0.430696 |
| slope | -0.168814 | -0.030711 | 0.119717 | -0.121475 | -0.004038 | -0.059894 | 0.093045 | 0.386784 | -0.257748 | -0.577537 | 1.000000 | -0.080155 | -0.104764 | 0.345877 |
| ca | 0.276326 | 0.118261 | -0.181053 | 0.101389 | 0.070511 | 0.137979 | -0.072042 | -0.213177 | 0.115739 | 0.222682 | -0.080155 | 1.000000 | 0.151832 | -0.391724 |
| thal | 0.068001 | 0.210041 | -0.161736 | 0.062210 | 0.098803 | -0.032019 | -0.011981 | -0.096439 | 0.206754 | 0.210244 | -0.104764 | 0.151832 | 1.000000 | -0.344029 |
| target | -0.225439 | -0.280937 | 0.433798 | -0.144931 | -0.085239 | -0.028046 | 0.137230 | 0.421741 | -0.436757 | -0.430696 | 0.345877 | -0.391724 | -0.344029 | 1.000000 |
plt.figure(figsize=(10, 10))
# numeric_only=True: skip the derived Sex_S / Target_S string columns,
# which make .corr() raise a TypeError on pandas >= 2.0.
sns.heatmap(data.corr(numeric_only=True), annot=True, vmin=-1, vmax=1,
            center=0, cmap='coolwarm')
<AxesSubplot:>
From the above correlation matrix we can conclude that the target variable (whether the person has cardiovascular disease) shows a strong negative correlation with ('exang', 'oldpeak', 'ca', 'thal') and a weak negative correlation with ('chol', 'fbs', 'trestbps').
# A scatter plot needs two numeric axes; for two categorical columns a
# count plot shows the intended sex-vs-outcome breakdown.
sns.countplot(data=data, x='Sex_S', hue='Target_S')
<AxesSubplot:xlabel='Sex_S'>
# BUG FIX: dropping only 'target' leaves Target_S (a string re-encoding of
# the target => direct label leakage) and Sex_S (non-numeric, which sklearn
# estimators cannot consume) in the feature matrix. Drop all three.
X = data.drop(columns=['target', 'Sex_S', 'Target_S'], errors='ignore')
y = data['target']
from sklearn.model_selection import train_test_split
# NOTE(review): no random_state here, so this split is not reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
# Explicit numeric feature matrix and 0/1 target vector.
feature_cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
X = data[feature_cols]
y = data['target']
X.head()
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 |
| 1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 |
| 2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 |
| 3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 |
| 4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 |
# 80/20 split with a fixed seed. NOTE(review): this rebinds X_train/X_test
# but introduces UPPERCASE Y_train/Y_test alongside the lowercase
# y_train/y_test from the earlier, DIFFERENT split — easy to mix up below.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.20, random_state=20)
print(X_train.shape, X_test.shape)
(242, 13) (61, 13)
print(Y_train.shape, Y_test.shape)  # label fold sizes from the random_state=20 split
(242,) (61,)
X_train.shape  # training features from the random_state=20 split
(242, 13)
y_train.shape  # NOTE(review): lowercase y_train comes from the EARLIER split, not this one
(242,)
X_test.shape  # held-out features
(61, 13)
y_test.shape  # NOTE(review): lowercase y_test is from the earlier split as well
(61,)
from sklearn.linear_model import LogisticRegression
# max_iter raised: the default 100 iterations often fails to converge on
# this unscaled data (the warning was being suppressed globally).
logre = LogisticRegression(max_iter=1000)
# BUG FIX: the original fit X_train (from the random_state=20 split)
# against lowercase y_train, which came from an EARLIER, different split —
# features and labels were misaligned rows. Use the Y_train / Y_test that
# were produced by the SAME split as X_train / X_test.
logre.fit(X_train, Y_train)
Y_pred_logre = logre.predict(X_test)
from sklearn.metrics import accuracy_score
# accuracy_score(y_true, y_pred): true labels first, by sklearn convention.
accuracy_score_logre = accuracy_score(Y_test, Y_pred_logre) * 100
y_pred = logre.predict(X_test)
# Actual-vs-predicted table for the test fold.
# BUG FIX: pair predictions with Y_test (same split as the X_test used for
# y_pred); the original zipped lowercase y_test from a different split and
# misspelled 'prediction'. The element-wise loop is replaced by a direct
# DataFrame construction.
result = pd.DataFrame({'Actual': list(Y_test), 'Prediction': list(y_pred)})
# Interactive chart: test labels as a marker+line trace with the model's
# predictions overlaid as markers, both indexed by test-sample position.
# NOTE(review): y_test here is from a different split than the X_test used
# to compute y_pred — verify the traces actually line up.
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(0, len(y_test)), y=y_test,
                         mode='markers+lines',
                         name='Test'))
fig.add_trace(go.Scatter(x=np.arange(0, len(y_test)), y=y_pred,
                         mode='markers',
                         name='Pred'))
from sklearn.tree import DecisionTreeClassifier
# Search random_state values for the best decision tree.
# NOTE(review): selecting the seed by TEST-set accuracy tunes on the test
# set, so the reported score is optimistically biased.
best_x, max_accuracy = 0, 0.0
for seed in range(200):
    candidate = DecisionTreeClassifier(random_state=seed)
    candidate.fit(X_train, Y_train)
    score = round(accuracy_score(candidate.predict(X_test), Y_test) * 100, 2)
    if score > max_accuracy:  # strict '>' keeps the earliest best seed, as before
        max_accuracy, best_x = score, seed
# Refit with the winning seed and report its test accuracy.
dt = DecisionTreeClassifier(random_state=best_x)
dt.fit(X_train, Y_train)
Y_pred_dt = dt.predict(X_test)
accuracy_score_DT = accuracy_score(Y_pred_dt, Y_test) * 100
print(accuracy_score_DT)
80.32786885245902
from sklearn import svm
# Linear-kernel support vector classifier (fit returns the estimator itself).
SV = svm.SVC(kernel='linear').fit(X_train, Y_train)
Y_pred_SVM = SV.predict(X_test)
accuracy_score_SVM = 100 * accuracy_score(Y_test, Y_pred_SVM)  # accuracy is symmetric in its args
print(accuracy_score_SVM)
75.40983606557377
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes baseline.
NB = GaussianNB().fit(X_train, Y_train)
Y_pred_NB = NB.predict(X_test)
accuracy_score_NB = 100 * accuracy_score(Y_test, Y_pred_NB)  # accuracy is symmetric in its args
print(accuracy_score_NB)
67.21311475409836
from sklearn.metrics import accuracy_score
# NOTE(review): y_test comes from a different split than the X_test used to
# compute y_pred, so this ~0.49 accuracy reflects misaligned labels.
print(accuracy_score(y_test, y_pred))
0.4918032786885246
from sklearn.metrics import classification_report
# Per-class precision/recall/F1. NOTE(review): y_test and y_pred originate
# from different train/test splits here — verify alignment.
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.43 0.32 0.37 28
1 0.53 0.64 0.58 33
accuracy 0.49 61
macro avg 0.48 0.48 0.47 61
weighted avg 0.48 0.49 0.48 61
from sklearn.metrics import confusion_matrix
# Raw confusion-matrix counts, printed and shown as an annotated heatmap.
print(confusion_matrix(y_test, y_pred))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True)
[[ 9 19] [12 21]]
<AxesSubplot:>
# Side-by-side accuracy summary for the four classifiers.
Scores = [accuracy_score_logre, accuracy_score_NB, accuracy_score_SVM, accuracy_score_DT]
algorithms = ["Logistic Regression", "Naive Bayes", "Support Vector Machine", "Decision Tree"]
for name, score in zip(algorithms, Scores):
    print(name, ';', score, '\n')
Logistic Regression ; 70.49180327868852 Naive Bayes ; 67.21311475409836 Support Vector Machine ; 75.40983606557377 Decision Tree ; 80.32786885245902
# 'multi_class' is unnecessary for a binary target (and deprecated in recent
# scikit-learn). NOTE(review): clf is configured here but never fitted/used.
clf = LogisticRegression(random_state=0)
# BUG FIX: the original sample had 14 values, but the model is trained on 13
# features — predict() on it would raise a shape error. Use 13 plausible values.
new_observation = [[63, 1, 3, 145, 233, 1, 0, 150, 0, 2.3, 0, 0, 1]]
logre.predict(X_test)
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1], dtype=int64)
data.columns
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target', 'Sex_S',
'Target_S'],
dtype='object')
from sklearn.model_selection import train_test_split
# Fresh 80/20 split with a fixed seed; this rebinds the lowercase
# X_/y_ names that the regression section below relies on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
X_train.shape  # training fold: 242 rows x 13 features
(242, 13)
y_train.shape  # training-label count; matches X_train's row count
(242,)
X_test.shape  # held-out fold: 61 rows x 13 features
(61, 13)
y_test.shape  # held-out labels; matches X_test's row count
(61,)
X_train.head()  # preview training rows (before scaling)
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 184 | 50 | 1 | 0 | 150 | 243 | 0 | 0 | 128 | 0 | 2.6 | 1 | 0 | 3 |
| 19 | 69 | 0 | 3 | 140 | 239 | 0 | 1 | 151 | 0 | 1.8 | 2 | 2 | 2 |
| 118 | 46 | 0 | 1 | 105 | 204 | 0 | 1 | 172 | 0 | 0.0 | 2 | 0 | 2 |
| 41 | 48 | 1 | 1 | 130 | 245 | 0 | 0 | 180 | 0 | 0.2 | 1 | 0 | 2 |
| 59 | 57 | 0 | 0 | 128 | 303 | 0 | 0 | 159 | 0 | 0.0 | 2 | 1 | 2 |
y_train.head()  # targets aligned with X_train's index
184 0 19 1 118 1 41 1 59 1 Name: target, dtype: int64
from sklearn.preprocessing import StandardScaler
# Standardize features to zero mean / unit variance. The scaler is fit on
# the training fold only and reused on the test fold to avoid leakage.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # returns a NumPy array (column names are lost)
X_test = sc.transform(X_test)        # applies the training-fold statistics
Feature scaling and transformation
X_train  # standardized feature matrix (now a NumPy array)
array([[-0.49780259, 0.68964466, -0.9646892 , ..., -0.70919889,
-0.71885794, 1.1581505 ],
[ 1.64736734, -1.4500221 , 1.87079404, ..., 0.95707423,
1.29228219, -0.55082768],
[-0.94941732, -1.4500221 , -0.01952812, ..., 0.95707423,
-0.71885794, -0.55082768],
...,
[ 0.40542685, 0.68964466, 0.92563296, ..., -0.70919889,
-0.71885794, 1.1581505 ],
[-1.40103204, 0.68964466, -0.9646892 , ..., -0.70919889,
-0.71885794, -2.25980586],
[-0.27199523, 0.68964466, 0.92563296, ..., 0.95707423,
-0.71885794, 1.1581505 ]])
data.columns  # includes the derived Sex_S / Target_S label columns
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target', 'Sex_S',
'Target_S'],
dtype='object')
• Build a regression model for prediction. • Select an appropriate regression algorithm (e.g., linear regression, ridge regression, or any of your choice). • Train the model on the training set and evaluate its performance on the testing set. • Assess model performance using relevant regression metrics (e.g., RMSE, R-squared). • Visualize the model's predictions and actual values. • Provide insights into the factors that contribute most to the model.
from sklearn.linear_model import LinearRegression
# Ordinary least squares on the 0/1 target (a linear probability model);
# predictions are continuous and can fall outside [0, 1].
regressor = LinearRegression()
regressor.fit(X_train, y_train)
LinearRegression()
regressor  # echo the fitted estimator's configuration
LinearRegression()
regressor.coef_  # one weight per (standardized) feature
array([ 0.00465239, -0.09513778, 0.11226421, -0.02614667, -0.04831201,
-0.01263307, 0.01116324, 0.0637316 , -0.04771842, -0.07014892,
0.05551112, -0.09315509, -0.09568933])
# Continuous predictions for the test fold.
y_pred = regressor.predict(X_test)
y_pred
array([ 0.96941818, 0.00301052, -0.09529432, 0.96516484, 0.67995599,
0.05506014, 0.22754127, 0.38328528, 0.6096094 , 0.58559208,
0.31665131, 0.57146363, 0.34759999, 0.59644477, 0.17510789,
0.61342744, 0.32218754, 0.33479871, 0.1483229 , 0.58285521,
0.47099595, 0.98908423, 0.38396744, 0.96299056, 1.1132148 ,
0.13804823, -0.16730756, 0.87660871, -0.09871093, -0.13626252,
1.01349615, 1.27070071, -0.08322494, 0.65596908, 0.1487065 ,
0.96681285, 0.34990931, 0.9452917 , 1.1087582 , 0.77932046,
0.46140969, 0.05445854, 0.65358252, 0.67622807, -0.12403366,
0.8871549 , 0.3184862 , 0.90540762, -0.25529644, 0.89642486,
0.62554646, 0.60015273, 0.66260979, -0.06878203, 0.90904774,
0.77314907, 0.58430808, 0.53670981, 0.79255476, 0.80890504,
0.17419823])
y_test  # actual 0/1 targets for the test fold
69 1
300 0
220 0
134 1
7 1
..
1 1
23 1
56 1
75 1
252 0
Name: target, Length: 61, dtype: int64
# Evaluation
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
# Mean absolute error of the continuous predictions.
mean_absolute_error(y_test, y_pred)
0.26854949072442375
mean_squared_error(y_test, y_pred)  # MSE
0.11486071780768763
np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE, in target units
0.3389110765491261
# Coefficient of determination.
r2 = r2_score(y_test, y_pred)
r2
0.5374494253653619
# Adjusted R^2: penalizes R^2 for the number of predictors p given n samples.
n, p = X_test.shape
n
p
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adj_r2
0.40950990472173854
• Create additional features that might improve the classification and regression models. Explain the rationale behind feature engineering choices.
# BUG FIX: data still contains the derived columns Sex_S (non-numeric) and
# Target_S (a string copy of the target => direct label leakage), so they
# must be dropped along with 'target' itself.
X = data.drop(columns=['target', 'Sex_S', 'Target_S'], errors='ignore')
y = data['target']
# Train Test Split
from sklearn.model_selection import train_test_split
# NOTE(review): no random_state, so this split is not reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)